HW4¶

Mu Jin

Q1¶

In [2]:
## import library
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import sqlite3
In [3]:
# Load the ROI hierarchy lookup table (MRIcloud multilevel labels).
url = "https://raw.githubusercontent.com/bcaffo/MRIcloudT1volumetrics/master/inst/extdata/multilevel_lookup_table.txt"
multilevel_lookup = pd.read_csv(url, sep = "\t").drop(['Level5'], axis = 1)
# The raw file ships placeholder column names ("modify", "modify.1", ...);
# rename them to the hierarchy levels they represent.
multilevel_lookup = multilevel_lookup.rename(columns = {
    "modify"   : "roi", 
    "modify.1" : "level4",
    "modify.2" : "level3", 
    "modify.3" : "level2",
    "modify.4" : "level1"})
multilevel_lookup = multilevel_lookup[['roi', 'level4', 'level3', 'level2', 'level1']]
# Load the subject-level volumetric data.
# `subject_id` instead of `id` — `id` shadows the Python builtin.
subject_id = 127
subjectData = pd.read_csv("https://raw.githubusercontent.com/smart-stats/ds4bio_book/main/book/assetts/kirby21AllLevels.csv")
# Keep only type == 1, level == 5 rows for this subject.
subjectData = subjectData.loc[(subjectData.type == 1) & (subjectData.level == 5) & (subjectData.id == subject_id)]
subjectData = subjectData[['roi', 'volume']]
# Attach the hierarchy labels to each ROI.
subjectData = pd.merge(subjectData, multilevel_lookup, on = "roi")
# Constant root node for the Sankey diagram, plus each ROI's share of the
# total volume (composition).
subjectData = subjectData.assign(icv = "ICV")
subjectData = subjectData.assign(comp = subjectData.volume / np.sum(subjectData.volume))
subjectData.head()
Out[3]:
roi volume level4 level3 level2 level1 icv comp
0 SFG_L 12926 SFG_L Frontal_L CerebralCortex_L Telencephalon_L ICV 0.009350
1 SFG_R 10050 SFG_R Frontal_R CerebralCortex_R Telencephalon_R ICV 0.007270
2 SFG_PFC_L 12783 SFG_L Frontal_L CerebralCortex_L Telencephalon_L ICV 0.009247
3 SFG_PFC_R 11507 SFG_R Frontal_R CerebralCortex_R Telencephalon_R ICV 0.008324
4 SFG_pole_L 3078 SFG_L Frontal_L CerebralCortex_L Telencephalon_L ICV 0.002227
In [4]:
# Create separate tables that pair adjacent hierarchy levels:
# icv&level1, level1&level2, level2&level3.
dat_icvl1 = subjectData.drop(['roi', 'volume', 'level4', 'level3', 'level2'],
                             axis = 1)
dat_l1l2 = subjectData.drop(['roi', 'volume', 'level4', 'level3', 'icv'],
                            axis = 1)
dat_l2l3 = subjectData.drop(['roi', 'volume', 'level4', 'level1', 'icv'],
                            axis = 1)
# Only the final expression of a cell is displayed; the original bare
# dat_icvl1.head() / dat_l1l2.head() lines were no-ops and were removed
# (use display(...) if all three previews are wanted).
dat_l2l3.head()
Out[4]:
level3 level2 comp
0 Frontal_L CerebralCortex_L 0.009350
1 Frontal_R CerebralCortex_R 0.007270
2 Frontal_L CerebralCortex_L 0.009247
3 Frontal_R CerebralCortex_R 0.008324
4 Frontal_L CerebralCortex_L 0.002227
In [5]:
# Build the (source, target, value) link table for a Sankey diagram.
t_list = [('icv','level1'),('level1','level2'),('level2','level3')]
def df_sankey(df, cols_tuple_list):
    """Convert a wide hierarchy frame into Sankey link data.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing every column named in `cols_tuple_list`, plus at
        least one extra column to count over.
    cols_tuple_list : list of (str, str)
        Adjacent (source-column, target-column) pairs, one per diagram stage.

    Returns
    -------
    (pd.DataFrame, list)
        Link frame with integer 'source'/'target' node ids and a 'value'
        row count per link, and the node labels indexed by those ids.
    """
    pieces = []
    for src_col, tgt_col in cols_tuple_list:
        # Count rows per (source, target) pair; any surviving column works
        # as the counter, so keep just the first. `axis=0` was dropped from
        # groupby — it is the default and the keyword is deprecated/removed
        # in modern pandas.
        counts = df.groupby(by=[src_col, tgt_col]).count().iloc[:, [0]]
        counts.columns = ['value']
        pieces.append(counts)
    links = pd.concat(pieces, axis=0).reset_index()
    links.columns = ['source', 'target', 'value']
    # Sort the labels so node ids are deterministic across runs; iterating a
    # raw set varies with string-hash randomization, making the diagram's
    # node order non-reproducible.
    label_list = sorted(set(links['source']) | set(links['target']))
    labels = {name: idx for idx, name in enumerate(label_list)}
    links = links.replace(labels)
    return links, label_list

# Build the link table and node labels, then render the Sankey diagram.
s, labels = df_sankey(subjectData[['icv', 'level1', 'level2', 'level3']], t_list)

sankey_trace = go.Sankey(
    node = dict(
        pad = 6,
        thickness = 20,
        line = dict(color = "black", width = 1),
        label = labels,
    ),
    link = dict(
        source = s['source'].values,
        target = s['target'].values,
        value = s['value'].values,
    ),
)
fig = go.Figure(data=[sankey_trace])

# Caption each hierarchy level below its column of nodes.
for x_pos, y_pos, caption in [(0, 0.2, "ICV"), (0.34, 0.2, "Level1"),
                              (0.7, 0.1, "Level2"), (1, 0.05, "Level3")]:
    fig.add_annotation(x=x_pos, y=y_pos, text=caption, showarrow=False,
                       yshift=-20, font=dict(size=12, color="black"))

fig.update_layout(title_text="Sankey", font_size=10, width=1000, height=1000)
fig.show()
#fig.write_html("/Users/mujin/Library/CloudStorage/OneDrive-JohnsHopkins/academic file/ScM/Term 7/ds/Assignment 4/q2.html")

Q2¶

This is the interactive plot in my github public repository
https://github.com/mjin19/ds4ph/blob/main/q2.html
As the file may be too large to render directly on GitHub, the following link opens it in a web browser:
https://htmlpreview.github.io/?https://github.com/mjin19/ds4ph/blob/main/q2.html

Q3¶

Here I show how I created a database and imported the three data files into it using sqlite3:

(base) mujin@Mus-MacBook-Pro ~ % wget https://raw.githubusercontent.com/opencasestudies/ocs-bp-opioid-rural-urban/master/data/simpler_import/county_pop_arcos.csv
wget https://raw.githubusercontent.com/opencasestudies/ocs-bp-opioid-rural-urban/master/data/simpler_import/land_area.csv
wget https://raw.githubusercontent.com/opencasestudies/ocs-bp-opioid-rural-urban/master/data/simpler_import/county_annual.csv --2023-02-18 23:51:04-- https://raw.githubusercontent.com/opencasestudies/ocs-bp-opioid-rural-urban/master/data/simpler_import/county_pop_arcos.csv
(base) mujin@Mus-MacBook-Pro ~ % sqlite3 opioid.db
SQLite version 3.39.3 2022-09-05 11:02:23
sqlite> .mode csv
sqlite> .import county_pop_arcos.csv population
sqlite> .import county_annual.csv annual
sqlite> .import land_area.csv land
sqlite> .tables
annual county_pop_arcos land_area
county_annual land population

In [7]:
# Do the data wrangling in Python.
# Create a connection to the SQLite database built in the shell session above.
# NOTE(review): hardcoded absolute local path — prefer a path relative to the
# notebook (e.g. Path("opioid.db")) so this runs on other machines.
DB_PATH = '/Users/mujin/Library/CloudStorage/OneDrive-JohnsHopkins/academic file/ScM/Term 7/ds/Assignment 4/opioid.db'
conn = sqlite3.connect(DB_PATH)
# Read each imported table into a DataFrame. The connection is left open
# deliberately; call conn.close() once all queries are done.
population = pd.read_sql("SELECT * FROM population", conn)
annual = pd.read_sql("SELECT * FROM annual", conn)
land = pd.read_sql("SELECT * FROM land", conn)
In [8]:
# Preview the first 10 rows of the population table.
population.head(10)
Out[8]:
Unnamed: 0 BUYER_COUNTY BUYER_STATE countyfips STATE COUNTY county_name NAME variable year population
0 1 AUTAUGA AL 1001 1 1 Autauga Autauga County, Alabama B01003_001 2006 51328
1 2 BALDWIN AL 1003 1 3 Baldwin Baldwin County, Alabama B01003_001 2006 168121
2 3 BARBOUR AL 1005 1 5 Barbour Barbour County, Alabama B01003_001 2006 27861
3 4 BIBB AL 1007 1 7 Bibb Bibb County, Alabama B01003_001 2006 22099
4 5 BLOUNT AL 1009 1 9 Blount Blount County, Alabama B01003_001 2006 55485
5 6 BULLOCK AL 1011 1 11 Bullock Bullock County, Alabama B01003_001 2006 10776
6 7 BUTLER AL 1013 1 13 Butler Butler County, Alabama B01003_001 2006 20815
7 8 CALHOUN AL 1015 1 15 Calhoun Calhoun County, Alabama B01003_001 2006 115388
8 9 CHAMBERS AL 1017 1 17 Chambers Chambers County, Alabama B01003_001 2006 34945
9 10 CHEROKEE AL 1019 1 19 Cherokee Cherokee County, Alabama B01003_001 2006 25466
In [9]:
# Preview the first 10 rows of the annual table.
annual.head(10)
Out[9]:
Unnamed: 0 BUYER_COUNTY BUYER_STATE year count DOSAGE_UNIT countyfips
0 1 ABBEVILLE SC 2006 877 363620.0 45001.0
1 2 ABBEVILLE SC 2007 908 402940.0 45001.0
2 3 ABBEVILLE SC 2008 871 424590.0 45001.0
3 4 ABBEVILLE SC 2009 930 467230.0 45001.0
4 5 ABBEVILLE SC 2010 1197 539280.0 45001.0
5 6 ABBEVILLE SC 2011 1327 566560.0 45001.0
6 7 ABBEVILLE SC 2012 1509 589010.0 45001.0
7 8 ABBEVILLE SC 2013 1572 596420.0 45001.0
8 9 ABBEVILLE SC 2014 1558 641350.0 45001.0
9 10 ACADIA LA 2006 5802 1969720.0 22001.0
In [10]:
# Preview the first 10 rows of the land table.
land.head(10)
Out[10]:
Unnamed: 0 Areaname STCOU LND010190F LND010190D LND010190N1 LND010190N2 LND010200F LND010200D LND010200N1 ... LND110210N1 LND110210N2 LND210190F LND210190D LND210190N1 LND210190N2 LND210200F LND210200D LND210200N1 LND210200N2
0 1 UNITED STATES 0 0 3787425.08 0 0 0 3794083.06 0 ... 0 0 0 251083.35 0 0 0 256644.62 0 0
1 2 ALABAMA 1000 0 52422.94 0 0 0 52419.02 0 ... 0 0 0 1672.71 0 0 0 1675.01 0 0
2 3 Autauga, AL 1001 0 604.49 0 0 0 604.45 0 ... 0 0 0 8.48 0 0 0 8.48 0 0
3 4 Baldwin, AL 1003 0 2027.08 0 0 0 2026.93 0 ... 0 0 0 430.55 0 0 0 430.58 0 0
4 5 Barbour, AL 1005 0 904.59 0 0 0 904.52 0 ... 0 0 0 19.59 0 0 0 19.61 0 0
5 6 Bibb, AL 1007 0 625.50 0 0 0 626.16 0 ... 0 0 0 3.14 0 0 0 3.14 0 0
6 7 Blount, AL 1009 0 650.65 0 0 0 650.60 0 ... 0 0 0 4.97 0 0 0 5.02 0 0
7 8 Bullock, AL 1011 0 626.11 0 0 0 626.06 0 ... 0 0 0 1.04 0 0 0 1.04 0 0
8 9 Butler, AL 1013 0 777.99 0 0 0 777.92 0 ... 0 0 0 1.05 0 0 0 1.05 0 0
9 10 Calhoun, AL 1015 0 612.35 0 0 0 612.32 0 ... 0 0 0 3.85 0 0 0 3.86 0 0

10 rows × 35 columns

In [11]:
# Peek at annual records that are missing a county FIPS code.
annual.loc[annual['countyfips'].isna()].head(10)
Out[11]:
Unnamed: 0 BUYER_COUNTY BUYER_STATE year count DOSAGE_UNIT countyfips
187 188 ADJUNTAS PR 2006 147 102800.0 NaN
188 189 ADJUNTAS PR 2007 153 104800.0 NaN
189 190 ADJUNTAS PR 2008 153 45400.0 NaN
190 191 ADJUNTAS PR 2009 184 54200.0 NaN
191 192 ADJUNTAS PR 2010 190 56200.0 NaN
192 193 ADJUNTAS PR 2011 186 65530.0 NaN
193 194 ADJUNTAS PR 2012 138 57330.0 NaN
194 195 ADJUNTAS PR 2013 138 65820.0 NaN
195 196 ADJUNTAS PR 2014 90 59490.0 NaN
196 197 AGUADA PR 2006 160 49200.0 NaN
In [12]:
# Among rows missing a FIPS code, show those outside Puerto Rico (PR).
annual.loc[(annual['BUYER_STATE'] != 'PR') & annual['countyfips'].isna()].head(10)
Out[12]:
Unnamed: 0 BUYER_COUNTY BUYER_STATE year count DOSAGE_UNIT countyfips
10071 10072 GUAM GU 2006 319 265348.0 NaN
10072 10073 GUAM GU 2007 330 275600.0 NaN
10073 10074 GUAM GU 2008 313 286900.0 NaN
10074 10075 GUAM GU 2009 390 355300.0 NaN
10075 10076 GUAM GU 2010 510 413800.0 NaN
10076 10077 GUAM GU 2011 559 475600.0 NaN
10077 10078 GUAM GU 2012 616 564800.0 NaN
10078 10079 GUAM GU 2013 728 623200.0 NaN
10079 10080 GUAM GU 2014 712 558960.0 NaN
17429 17430 MONTGOMERY AR 2006 469 175390.0 NaN
In [13]:
# Fill in the missing countyfips for Montgomery County, AR, then re-check
# which non-PR rows still lack a FIPS code.
# NOTE(review): '05097' is assigned as a string while the rest of the
# countyfips column prints as floats (e.g. 45001.0) — confirm the column
# dtype; a mixed-type value could fail to match in later FIPS joins.
annual.loc[(annual['BUYER_STATE'] == 'AR') & (annual['BUYER_COUNTY'] == 'MONTGOMERY'), 'countyfips'] = '05097'
annual[(annual['countyfips'].isna()) & (annual['BUYER_STATE'] != 'PR')].head(10)
Out[13]:
Unnamed: 0 BUYER_COUNTY BUYER_STATE year count DOSAGE_UNIT countyfips
10071 10072 GUAM GU 2006 319 265348.0 NaN
10072 10073 GUAM GU 2007 330 275600.0 NaN
10073 10074 GUAM GU 2008 313 286900.0 NaN
10074 10075 GUAM GU 2009 390 355300.0 NaN
10075 10076 GUAM GU 2010 510 413800.0 NaN
10076 10077 GUAM GU 2011 559 475600.0 NaN
10077 10078 GUAM GU 2012 616 564800.0 NaN
10078 10079 GUAM GU 2013 728 623200.0 NaN
10079 10080 GUAM GU 2014 712 558960.0 NaN
18501 18502 NORTHERN MARIANA ISLANDS MP 2006 165 117850.0 NaN
In [14]:
# List every annual record that has no county name at all.
annual.loc[annual['BUYER_COUNTY'].isnull()]
Out[14]:
Unnamed: 0 BUYER_COUNTY BUYER_STATE year count DOSAGE_UNIT countyfips
27741 27742 None AE 2006 2 330.0 NaN
27742 27743 None CA 2006 47 12600.0 NaN
27743 27744 None CT 2006 305 78700.0 NaN
27744 27745 None CT 2007 112 30900.0 NaN
27745 27746 None CT 2008 48 15000.0 NaN
27746 27747 None FL 2006 9 900.0 NaN
27747 27748 None FL 2007 7 700.0 NaN
27748 27749 None GA 2006 114 51700.0 NaN
27749 27750 None IA 2006 7 2300.0 NaN
27750 27751 None IN 2006 292 39300.0 NaN
27751 27752 None MA 2006 247 114900.0 NaN
27752 27753 None NV 2006 380 173600.0 NaN
27753 27754 None NV 2007 447 200600.0 NaN
27754 27755 None NV 2008 5 2200.0 NaN
27755 27756 None OH 2006 23 5100.0 NaN
27756 27757 None PR 2006 10 17800.0 NaN
27757 27758 None PR 2007 2 1300.0 NaN
In [15]:
# Discard the records whose county name is unknown.
annual = annual.loc[annual['BUYER_COUNTY'].notna()]
In [16]:
# Confirm that no county-less records remain (expect an empty frame).
annual.loc[annual['BUYER_COUNTY'].isnull()]
Out[16]:
Unnamed: 0 BUYER_COUNTY BUYER_STATE year count DOSAGE_UNIT countyfips
In [17]:
# Keep only the three columns needed from the land table.
land_area = land.loc[:, ['Areaname', 'STCOU', 'LND110210D']].copy()
# Preview the result.
land_area.head(10)
Out[17]:
Areaname STCOU LND110210D
0 UNITED STATES 0 3531905.43
1 ALABAMA 1000 50645.33
2 Autauga, AL 1001 594.44
3 Baldwin, AL 1003 1589.78
4 Barbour, AL 1005 884.88
5 Bibb, AL 1007 622.58
6 Blount, AL 1009 644.78
7 Bullock, AL 1011 622.81
8 Butler, AL 1013 776.83
9 Calhoun, AL 1015 605.87
In [18]:
# Rename STCOU so it matches the FIPS column name used by the other tables.
land_area = land_area.rename({'STCOU': 'countyfips'}, axis=1)
# Preview the renamed frame.
land_area.head(10)
Out[18]:
Areaname countyfips LND110210D
0 UNITED STATES 0 3531905.43
1 ALABAMA 1000 50645.33
2 Autauga, AL 1001 594.44
3 Baldwin, AL 1003 1589.78
4 Barbour, AL 1005 884.88
5 Bibb, AL 1007 622.58
6 Blount, AL 1009 644.78
7 Bullock, AL 1011 622.81
8 Butler, AL 1013 776.83
9 Calhoun, AL 1015 605.87
In [19]:
# Left-join the land-area columns onto population by county FIPS code.
county_info = population.merge(land_area, on='countyfips', how='left')
county_info.head(10)
Out[19]:
Unnamed: 0 BUYER_COUNTY BUYER_STATE countyfips STATE COUNTY county_name NAME variable year population Areaname LND110210D
0 1 AUTAUGA AL 1001 1 1 Autauga Autauga County, Alabama B01003_001 2006 51328 Autauga, AL 594.44
1 2 BALDWIN AL 1003 1 3 Baldwin Baldwin County, Alabama B01003_001 2006 168121 Baldwin, AL 1589.78
2 3 BARBOUR AL 1005 1 5 Barbour Barbour County, Alabama B01003_001 2006 27861 Barbour, AL 884.88
3 4 BIBB AL 1007 1 7 Bibb Bibb County, Alabama B01003_001 2006 22099 Bibb, AL 622.58
4 5 BLOUNT AL 1009 1 9 Blount Blount County, Alabama B01003_001 2006 55485 Blount, AL 644.78
5 6 BULLOCK AL 1011 1 11 Bullock Bullock County, Alabama B01003_001 2006 10776 Bullock, AL 622.81
6 7 BUTLER AL 1013 1 13 Butler Butler County, Alabama B01003_001 2006 20815 Butler, AL 776.83
7 8 CALHOUN AL 1015 1 15 Calhoun Calhoun County, Alabama B01003_001 2006 115388 Calhoun, AL 605.87
8 9 CHAMBERS AL 1017 1 17 Chambers Chambers County, Alabama B01003_001 2006 34945 Chambers, AL 596.53
9 10 CHEROKEE AL 1019 1 19 Cherokee Cherokee County, Alabama B01003_001 2006 25466 Cherokee, AL 553.70
In [20]:
# Sanity-check row counts: land_area should match land, and the left join
# should preserve population's row count.
for frame in (land, land_area, population, county_info):
    print(len(frame))
3198
3198
28265
28265
In [21]:
fig = px.scatter(annual, x='year', y='count',color='BUYER_STATE', labels={'BUYER_STATE': 'State'}, render_mode='svg')
fig.update_layout(height=800, width=1000)
fig.show()
#fig.write_html("/Users/mujin/Library/CloudStorage/OneDrive-JohnsHopkins/academic file/ScM/Term 7/ds/Assignment 4/q3.html")

Q4¶

This is the interactive plot in my github public repository
https://github.com/mjin19/ds4ph/blob/main/q4.html
As the file may be too large to render directly on GitHub, the following link opens it in a web browser:
https://htmlpreview.github.io/?https://github.com/mjin19/ds4ph/blob/main/q4.html